High dimensional single-cell analysis reveals iNKT cell developmental trajectories and effector fate decision

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import scrublet as scr
import doubletdetection
import warnings
warnings.filterwarnings('ignore')

import logging
mpl_logger = logging.getLogger('matplotlib')
mpl_logger.setLevel(logging.WARNING) 

sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()
scanpy==1.4.4.post1 anndata==0.6.22.post1 umap==0.3.10 numpy==1.17.3 scipy==1.4.1 pandas==0.25.3 scikit-learn==0.21.3 statsmodels==0.10.2 python-igraph==0.7.1 louvain==0.6.1
In [2]:
sc.settings.set_figure_params(dpi=130)
In [3]:
ko = sc.read_h5ad('./output/ko.preprocessing.h5ad')
ko.shape
Out[3]:
(4184, 13720)
In [4]:
#hvgs = adata_ko.var['highly_variable'].intersection(adata_wt.var['highly_variable'])
sc.pp.highly_variable_genes(ko, min_mean=0.0125, max_mean=3, min_disp=0.5)
sc.pl.highly_variable_genes(ko)
np.sum(ko.var['highly_variable'])
extracting highly variable genes
    finished (0:00:01)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)
Out[4]:
1783
In [5]:
sc.pp.scale(ko, max_value=10)
In [6]:
sc.tl.pca(ko, svd_solver='arpack')
computing PCA with n_comps = 50
computing PCA on highly variable genes
    finished (0:00:00)
In [7]:
sc.pl.pca_variance_ratio(ko, log=True)
In [8]:
sc.pp.neighbors(ko)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    'distances', distances for each pair of neighbors
    'connectivities', weighted adjacency matrix (0:00:02)
In [9]:
sc.tl.umap(ko)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:12)
In [10]:
sc.tl.louvain(ko, resolution=1.2)
sc.pl.umap(ko, color=['louvain'])
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 11 clusters and added
    'louvain', the cluster labels (adata.obs, categorical) (0:00:00)
In [11]:
mean_cellType = np.empty((len(ko.obs['louvain'].unique()), ko.raw.shape[1]), 
                           dtype=float, order='C')
In [12]:
raw_adata = ko.raw.X.toarray()
In [13]:
for i in range(0, len(ko.obs['louvain'].unique())):
    #print(adata.obs['phenograph'].unique()[i])
    mean_cellType[i,:] = np.mean(raw_adata[ko.obs['louvain'] == ko.obs['louvain'].unique()[i], :], axis = 0)
In [14]:
mean_df = pd.DataFrame(np.corrcoef(mean_cellType), index = ko.obs['louvain'].unique(), columns = ko.obs['louvain'].unique())
In [15]:
import seaborn
ax = seaborn.clustermap(mean_df, cmap="jet").fig.suptitle('louvain') 
In [16]:
sc.tl.rank_genes_groups(ko, 'louvain', method='wilcoxon')
sc.pl.rank_genes_groups(ko, n_genes=25, sharey=False)
ranking genes
    finished: added to `.uns['rank_genes_groups']`
    'names', sorted np.recarray to be indexed by group ids
    'scores', sorted np.recarray to be indexed by group ids
    'logfoldchanges', sorted np.recarray to be indexed by group ids
    'pvals', sorted np.recarray to be indexed by group ids
    'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:05)
In [17]:
pd.DataFrame(ko.uns['rank_genes_groups']['names']).head(50)
Out[17]:
0 1 2 3 4 5 6 7 8 9 10
0 Izumo1r Nkg7 AW112010 Ptma Fcer1g Naca Emb Wls Ptma 2600014E21Rik Lef1
1 Cmtm7 Klrb1c H2-K1 Stmn1 Nkg7 Ifi27l2a Pxdc1 Id2 Stmn1 Pdcd1 H3f3a
2 Smpdl3a Klrd1 H2-D1 Ran Cd7 Tcf7 Ckb Ltb 2810417H13Rik Plac8 Sox4
3 S100a10 Wls Ltb H2afz S100a6 Plac8 Blk Malat1 Top2a Nrgn Itm2a
4 Ptprcap Id2 B2m Cks1b Klrd1 Id3 Il1r1 Gm26740 Tubb5 Zbtb16 Gm43352
5 Ctsd Cd7 Shisa5 Hmgn2 Gzmb Tesc Fam83a Gimap6 Hmgb2 Lef1 Myb
6 Slamf6 H2-K1 Ms4a4b Ybx1 Crip1 Igfbp4 S100a4 Gimap4 H2afz Id3 Hmgn1
7 Srgn Ms4a4b Ctsw Hmgb2 Klra9 Ramp1 Sptssa Btg1 Dut Lztfl1 Ldhb
8 Drosha Mbnl1 Stat1 2700094K13Rik Ahnak Eif3h Lmo4 AW112010 Smc2 Icos Cd28
9 Gdi2 AW112010 Malat1 Anp32b Ccl5 Lef1 Ramp1 H2-D1 Hist1h1b Snrpe Cd27
10 Gimap4 Il2rb Ctsd Tesc Klre1 Rgcc Tmem176a H2-K1 Mki67 Mrpl52 Malat1
11 Icos Malat1 Rgs1 Eif5a Klrk1 Ass1 Rexo2 Ctsw Tuba1b Izumo1r Tox
12 Rac2 Styk1 Isg20 Ppia Klf2 Satb1 Il17re Izumo1r Anp32b Ccr9 Marcksl1
13 Ramp1 Xcl1 Ly6e Mif Ifngr1 Il6ra Capg Klrb1c Rrm2 Il4 Bcl2
14 S100a11 Ltb Ifngr1 Srsf2 Xcl1 Slamf6 Tmem176b Shisa5 Hist1h2ap Tcf7 Slc29a1
15 Arl4d Ctsw Btg1 Ranbp1 Ly6c2 Gm8730 Sepp1 Gimap1 Ezh2 Ramp1 Hivep3
16 Limd2 Gzmb Ifit3 Tuba1b Selplg Zfos1 Tnfrsf25 Tpt1 Ran Mgst2 Sept7
17 Eif3h H2-D1 Nkg7 Erh Anxa2 Zbtb16 Thy1 Ctsd Cks1b Hsp90ab1 Id3
18 Sh2d1a Hcst Isg15 Srsf7 Ms4a4b Snhg6 Il7r Cxcr3 Lmnb1 Smpdl3a Rap1a
19 Cd28 Sh3bgrl3 H2-Q7 Dut Tmsb4x Dpp4 Serpinb1a Ypel3 Dek Ckb Slamf6
20 Ly6e Fgl2 Rtp4 Txn1 Fgl2 Btf3 Rorc B2m Tmpo Il17rb Rasgrp1
21 Cd3d Efhd2 Sh3bgrl3 Ldha Klrc2 Ccr7 Ap3s1 Hcst Gmnn Eef1b2 Cnn3
22 Il18r1 Bcl2 Ifit1 Hspd1 Gzma Hsp90ab1 Gpx1 Gimap3 Asf1b Eef2 Sub1
23 Eif3f Lpar6 Gimap4 Igfbp4 Efhd2 Limd2 Avpi1 Ms4a4b Tyms Rgs10 Egr2
24 Gmfg Ptpn6 Rgs2 Prdx1 Hcst Sec61b Znrf1 Mbnl1 Hist1h1e Tox Cytip
25 Tpt1 H2-Q7 Hcst Nme1 S100a4 Eef1b2 Aqp3 Btg2 Hist1h2ae Nos1 Stmn1
26 Laptm5 Klrk1 Fgl2 Hnrnpab Ostf1 Gpx1 Il23r Ogt Fbxo5 Eef1g Tuba1a
27 Samd3 Ctla2a Mndal Uqcrq Itgb1 Hspe1 Sdc1 Ptprcap Smc4 Nav2 Ier3ip1
28 Tspan32 Ptprc Samd9l Set Bcl2 Fam3c Gm2a Nkg7 Tipin Fam3c Cd2
29 Zbtb16 Cd3g Ms4a6b Cox7a2 Vim Emb Rgcc Rsrp1 Cdk1 Rexo2 Ubb
30 Bcl2a1b Gimap3 Ifi203 Hsp90aa1 Klrb1c Slc25a5 Lgals3 Slfn2 Ranbp1 Slamf6 Spint2
31 Fam129a Klra9 Irf7 Hsp90ab1 Klra5 Bcl11b Icos Bcl2a1b Tk1 Cd247 Eif4g2
32 9530036M11Rik B4galnt1 H2-T22 Atp5j Ctsw Pabpc1 Nav2 Ly6e Clspn Slc29a1 Nrip1
33 Ptpn18 Ctsd Ifi47 Rgcc Emp3 Kcnn4 Akirin2 Lpar6 Hist1h4d Arhgap24 Supt4a
34 Ccnd2 Gimap6 Wls Snrpd1 Sh3bgrl3 Atp5e Lgals1 Ets1 Spc24 Ccng1 Tubb5
35 Ctsb Rabac1 Zbp1 2810417H13Rik Cd244 Gnb2l1 Slc6a13 Tbc1d4 Esco2 Npepl1 Camk4
36 Mxd4 Slamf7 Itm2b Lockd H2-K1 Psme2 S100a6 Cd27 Ybx1 Tnfsf11 Pebp1
37 Tecr Gimap4 Sp100 Anp32e AW112010 Mrpl52 F2r S100a11 Hnrnpab Csf1 Pdcd1
38 Naca B2m Pydc4 Cdca7 Ptprc Ccr9 Stk24 Il7r Hist1h1a Sepw1 Nrgn
39 Zfos1 Klrc2 Slfn2 Lmnb1 Styk1 Ppdpf Psap Cd3g Srsf2 Ifi27 Lcp1
40 Eef1b2 Cd160 Gbp7 Hnrnpa2b1 Cxcr6 Npm1 Ly6a Samd3 Atad2 Smco4 Psma6
41 Cd27 Il7r Fyb Hmgn1 Hsd11b1 Eif3i Tcf7 Ptpn6 Usp1 Il6ra Tex30
42 Tbc1d4 Shisa5 Gm4955 Psma7 Arl4c Cd5 Lrrc17 Gabarapl2 Cenpw Cst7 Cd84
43 Eef1a1 Ly6c2 Apobec3 Tipin Cd2 Eef1g Oaz1 Rbm39 Cbx5 Cd28 Dad1
44 Gimap5 Tpt1 Cxcr3 Phgdh Rabac1 Lztfl1 Camk2d Gimap5 Ube2s Gm8730 Tmsb10
45 Eef2 Cd2 Slfn8 Gapdh H2-Q7 Eif3k Prr13 Maf Ccna2 Cd5 Gsn
46 Lpcat4 Cxcr3 Dusp2 Dynll1 H2-D1 Ldha Lpcat3 H2-Q7 Snrpd1 Tgm3 Pfn1
47 Tox Cd52 Tpt1 Snrpf Malat1 Thy1 Plin3 Inpp4b Lig1 Gata3 Plgrkt
48 Btg1 Ifitm10 Lpar6 Hmgb1 Wls Lmo4 Ucp2 Ptpn18 Birc5 Ier3ip1 Bzw1
49 Aqp11 Gimap1 Xcl1 Slc25a5 Mbnl1 Pdlim1 Maf Cd3e Rrm1 Limd2 Ikzf1
In [18]:
sc.pl.umap(ko, color=['Mki67', 'Top2a', 'Ube2c'], color_map="jet") # Cycling
In [19]:
sc.pl.umap(ko, color=['Lef1','Itm2a'], color_map="jet") # NKT0
In [20]:
sc.pl.umap(ko, color=['Plac8','Icos'], color_map="jet") # NKT2
In [21]:
sc.pl.umap(ko, color=['Tmem176a', 'Emb'], color_map="jet") # NKT17
In [22]:
sc.pl.umap(ko, color=['Klrb1c','Il2rb'], color_map="jet") # NKT1
In [23]:
sc.pl.umap(ko, color=['Ifit1','Ifit3','Isg15'], color_map="jet") # NKT1d
In [24]:
sc.pl.umap(ko, color=['louvain'], legend_loc = 'on data', legend_fontsize = 6)
In [25]:
marker_genes = ['Lef1','Itm2a','Mki67','Top2a','Zbtb16','Plac8','Izumo1r','Ifit1','Ifit3','Isg15','Ly6a','Fhl2','Nkg7','Fcer1g','Gzma','Gzmb','Rorc','Serpinb1a','Tmem176a','Tmem176b']
ax = sc.pl.dotplot(ko, marker_genes, groupby='louvain')
In [53]:
new_cluster_names = {
    '0':'NKT2b', '1':'NKT1b', '2':'NKT1d', '3':'Cycling NKT', '4':'NKT1c',
    '5':'NKT2b','6':'NKT17', '7':'NKT1b', '8':'Cycling NKT', '9':'NKT2a',
    '10':'NKT0'}
In [54]:
vect = []
for i in range(0, len(ko.obs['louvain'])):
    vect = vect + [new_cluster_names[str(ko.obs['louvain'][i])]]
    
ko.obs['cell_type'] = vect
In [55]:
sc.pl.umap(ko, color='cell_type', title='', frameon=False)
... storing 'cell_type' as categorical
In [56]:
ko.obs['cell_type'].cat.categories
ko.obs['cell_type'].cat.reorder_categories(['NKT0','Cycling NKT','NKT17','NKT2a','NKT2b','NKT1b','NKT1c','NKT1d'], inplace = True)
ko.uns['cell_type_colors'] = [ '#A31E22', # NKT0
                                '#F3766E', # Cycling NKT 
                                '#2da9d2', # NKT17
                                '#FEC85A', # NKT2a
                                '#FAE600', # NKT2b
                                #'#50C878', # NKT1a
                                '#2E8B57', # NKT1b
                                '#0B6623', # NKT1c
                                '#4CBB17'] # NKT1d
In [57]:
sc.pl.umap(ko, color='cell_type', title='', frameon=False, save='_ko.ann.pdf')
WARNING: saving figure to file figures/umap_ko.ann.pdf
In [35]:
ko.shape
Out[35]:
(4184, 13720)
In [36]:
marker_genes = ['Lef1','Itm2a','Mki67','Top2a','Zbtb16','Plac8','Izumo1r','Ifit1','Ifit3','Isg15','Ly6a','Fhl2','Nkg7','Fcer1g','Gzma','Gzmb','Rorc','Serpinb1a','Tmem176a','Tmem176b']
ax = sc.pl.dotplot(ko, marker_genes, groupby='cell_type')
In [37]:
marker_genes = ['Blk', 'Rorc', 'Zbtb7b', 'Id2', 'Tbx21', 'Tox', 'Zbtb16', 'Gata3', 'E2f8', 'E2f2', 'Sox4','Lef1']
ax = sc.pl.dotplot(ko, marker_genes, groupby='cell_type')
In [38]:
sc.pl.matrixplot(ko, marker_genes, groupby='cell_type')
Out[38]:
GridSpec(2, 3, height_ratios=[0, 10.5], width_ratios=[3.84, 0, 0.2])
In [39]:
ko.write('./output/ko.ann.h5ad')

Saving cellbrowser

In [40]:
ko.obs['umap_1'] = pd.DataFrame(ko.obsm['X_umap']).iloc[:,0].tolist()
ko.obs['umap_2'] = pd.DataFrame(ko.obsm['X_umap']).iloc[:,1].tolist()
ko.obs.to_csv(path_or_buf = 'output/metadata.ko.tsv', sep = '\t', index = True)

sc.external.exporting.cellbrowser(ko, 
                                 'cellbrowser/ko', 
                                 'ko', 
                                  annot_keys=['cell_type', 'louvain', 'sample', 
                                              'n_counts', 'n_genes', 'percent_mito', 'percent_ribo'], 
                                 cluster_field='cell_type', nb_marker=30,
                                 skip_matrix=False, do_debug=True)
WARNING:root:cellbrowser/ko/cellbrowser.conf already exists. Overwriting existing files.
INFO:root:Writing scanpy matrix (4184 cells, 13720 genes) to cellbrowser/ko/exprMatrix.tsv.gz
INFO:root:Transposing matrix
INFO:root:Writing gene-by-gene, without using pandas
INFO:root:Writing 13720 genes in total
INFO:root:Wrote 0 genes
INFO:root:Wrote 2000 genes
INFO:root:Wrote 4000 genes
INFO:root:Wrote 6000 genes
INFO:root:Wrote 8000 genes
INFO:root:Wrote 10000 genes
INFO:root:Wrote 12000 genes
DEBUG:root:Compressing cellbrowser/ko/exprMatrix.tsv.gz.tmp
DEBUG:root:Running gzip -f cellbrowser/ko/exprMatrix.tsv.gz.tmp
DEBUG:root:Renaming cellbrowser/ko/exprMatrix.tsv.gz.tmp.gz to cellbrowser/ko/exprMatrix.tsv.gz
DEBUG:root:Couldnt find coordinates for ForceAtlas2, tried keys fa, X_fa and X_draw_graph_fa
DEBUG:root:Couldnt find coordinates for Fruchterman Reingold, tried keys fr, X_fr and X_draw_graph_fr
DEBUG:root:Couldnt find coordinates for Grid Fruchterman Reingold, tried keys grid_fr, X_grid_fr and X_draw_graph_grid_fr
DEBUG:root:Couldnt find coordinates for Kamadi Kawai, tried keys kk, X_kk and X_draw_graph_kk
DEBUG:root:Couldnt find coordinates for Large Graph Layout, tried keys lgl, X_lgl and X_draw_graph_lgl
DEBUG:root:Couldnt find coordinates for DrL Distributed Recursive Layout, tried keys drl, X_drl and X_draw_graph_drl
DEBUG:root:Couldnt find coordinates for Reingold Tilford tree, tried keys rt, X_rt and X_draw_graph_rt
DEBUG:root:Couldnt find coordinates for t-SNE, tried keys tsne, X_tsne and X_draw_graph_tsne
INFO:root:Writing UMAP coords to cellbrowser/ko/umap_coords.tsv
DEBUG:root:Couldnt find coordinates for PAGA/ForceAtlas2, tried keys pagaFa, X_pagaFa and X_draw_graph_pagaFa
DEBUG:root:Couldnt find coordinates for PAGA/Fruchterman-Reingold, tried keys pagaFr, X_pagaFr and X_draw_graph_pagaFr
DEBUG:root:Couldnt find coordinates for PHATE, tried keys phate, X_phate and X_draw_graph_phate
INFO:root:Writing cellbrowser/ko/markers.tsv
DEBUG:root:getting meta field: cell_type -> cell_type
DEBUG:root:getting meta field: louvain -> Louvain Cluster
DEBUG:root:getting meta field: sample -> sample
DEBUG:root:getting meta field: n_counts -> UMI Count
DEBUG:root:getting meta field: n_genes -> Expressed Genes
DEBUG:root:getting meta field: percent_mito -> Percent Mitochond.
DEBUG:root:getting meta field: percent_ribo -> percent_ribo
INFO:root:Generating cellbrowser/ko/quickGenes.tsv from cellbrowser/ko/markers.tsv
DEBUG:root:Reading cluster markers from cellbrowser/ko/markers.tsv
DEBUG:root:Separator for cellbrowser/ko/markers.tsv is '\t'
INFO:root:Reading cellbrowser/ko/markers.tsv: assuming marker file format (cluster, gene, score) + any other fields
DEBUG:root:Other headers: []
DEBUG:root:Other headers with type: []
DEBUG:root:score field has name 'z_score'
INFO:root:cellbrowser/ko/cellbrowser.conf already exists, not overwriting. Remove and re-run command to recreate.

Cycling cells

In [41]:
adata_ann = sc.read_h5ad('./output/ko.ann.h5ad')
adata_raw = sc.read_h5ad('./output/ko.preprocessing.h5ad')
adata_raw.shape
Out[41]:
(4184, 13720)
In [42]:
tokeep = []

for x in adata_ann.obs['cell_type']:
    tokeep = tokeep + [x in ['Cycling NKT']]

#keep_cells = adata_ann.obs['cell_type'] == ['Cycling NKT']
list_of_cell_names = adata_ann.obs_names[tokeep]
adata_cc = adata_raw[list_of_cell_names, ]
adata_cc.obs['cell_type'] = adata_ann.obs['cell_type'] 
adata_cc.shape
Trying to set attribute `.obs` of view, making a copy.
Out[42]:
(767, 13720)
In [43]:
sc.pp.highly_variable_genes(adata_cc, min_mean=0.25, max_mean=3, min_disp=0.5)
sc.pl.highly_variable_genes(adata_cc)
np.sum(adata_cc.var['highly_variable'])
extracting highly variable genes
    finished (0:00:00)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)
Out[43]:
828
In [44]:
sc.pp.scale(adata_cc, max_value=10)
In [45]:
sc.tl.pca(adata_cc, svd_solver='arpack')
computing PCA with n_comps = 50
computing PCA on highly variable genes
    finished (0:00:00)
In [46]:
sc.pl.pca_variance_ratio(adata_cc)# log = True
In [47]:
sc.pl.pca_loadings(adata_cc, components=list(range(0,7)))
In [48]:
sc.pp.neighbors(adata_cc)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    'distances', distances for each pair of neighbors
    'connectivities', weighted adjacency matrix (0:00:00)
In [49]:
sc.tl.umap(adata_cc)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:02)
In [50]:
# cell cycle scoring
cell_cycle_genes = [x.strip() for x in open('/data/10x_data/cell_cycle_vignette_files/regev_lab_cell_cycle_genes_mouse.txt')]
s_genes = cell_cycle_genes[:43]
g2m_genes = cell_cycle_genes[43:]
cell_cycle_genes = [x for x in cell_cycle_genes if x in adata_cc.var_names]
sc.tl.score_genes_cell_cycle(adata_cc, s_genes=s_genes, g2m_genes=g2m_genes)
calculating cell cycle phase
computing score 'S_score'
WARNING: gene: Mlf1ip is not in adata.var_names and will be ignored
    finished: added
    'S_score', score of gene set (adata.obs) (0:00:00)
computing score 'G2M_score'
    finished: added
    'G2M_score', score of gene set (adata.obs) (0:00:00)
-->     'phase', cell cycle phase (adata.obs)
In [51]:
sc.pl.umap(adata_cc, color=['cell_type','phase','G2M_score','S_score'], edges = False)
... storing 'phase' as categorical
In [52]:
sc.pl.umap(adata_cc, color=['Plac8','Icos','Izumo1r','Itm2a','Lef1','Tmem176a','Emb','Klrb1c','Il2rb'], edges = False)